Load the cleaned data produced by
data_preparation.rmd. This assumes
koi_data.Rda contains the necessary columns.
Define target and predictor columns. Review this list carefully.
# Binary classification target: Kepler pipeline disposition
# (CANDIDATE vs FALSE POSITIVE).
target_variable_name <- "koi_pdisposition"
# Numeric KOI features used as predictors: transit geometry/shape
# (period, duration, depth, impact, ror, dor, incl), stellar parameters
# (teff, logg, radius, mass, density, metallicity), and fit coefficients
# (limb darkening). Review this list when upstream data preparation changes.
predictor_cols <- c(
"koi_period", "koi_duration", "koi_depth", "koi_prad", "koi_teq",
"koi_insol", "koi_model_snr", "koi_steff", "koi_slogg", "koi_srad",
"koi_smass", "koi_impact", "koi_ror", "koi_srho", "koi_sma", "koi_incl",
"koi_dor", "koi_ldm_coeff1", "koi_ldm_coeff2", "koi_smet"
)
# Restrict the raw data to the target plus whichever requested predictors
# are actually present, warning about any that are missing.
selected_cols <- c(target_variable_name, predictor_cols)
available_mask <- selected_cols %in% names(koi_data_raw)
selected_cols_exist <- selected_cols[available_mask]
missing_cols <- selected_cols[!available_mask]
if (length(missing_cols) > 0) {
  print(paste("Warning: The following selected columns were not found:", paste(missing_cols, collapse = ", ")))
}
# Without the target column there is nothing to model — stop hard.
if (!target_variable_name %in% selected_cols_exist) {
  stop(paste("Target variable", target_variable_name, "not found in the data!"))
}
# Keep only the verified columns for the rest of the analysis.
koi_data <- koi_data_raw[, selected_cols_exist]
print(paste("Selected", ncol(koi_data), "columns for analysis."))## [1] "Selected 21 columns for analysis."
Stratified split based on the target variable.
# Stratified 80/20 split on the target (caret::createDataPartition) so both
# partitions preserve the CANDIDATE / FALSE POSITIVE class proportions.
set.seed(42) # for reproducibility
train_indices <- createDataPartition(koi_data[[target_variable_name]], p = 0.8, list = FALSE)
train_data <- koi_data[train_indices, ]
test_data <- koi_data[-train_indices, ]
print(paste("Training set size:", nrow(train_data)))## [1] "Training set size: 6399"
## [1] "Test set size: 1599"
Apply simple median/mode imputation. Consider more advanced methods if NAs are numerous or potentially non-random.
# --- Impute Training Data ---
# Numeric columns are imputed with the training-set median; statistics are
# computed on the training split only and reused on the test split to avoid
# data leakage.
numeric_predictors_train <- names(train_data)[sapply(train_data, is.numeric)]
factor_predictors_train <- names(train_data)[sapply(train_data, function(x) is.character(x) || is.factor(x))]
factor_predictors_train <- setdiff(factor_predictors_train, target_variable_name)
# Impute numeric with median (calculated from training data).
# NOTE(fix): store a median for EVERY numeric predictor, not only those with
# NAs in the training split — otherwise a test-set NA in a column that
# happened to be complete in training could never be imputed, and those test
# rows would be dropped by the later complete.cases() safety net.
train_medians <- list()
for (col in numeric_predictors_train) {
  median_val <- median(train_data[[col]], na.rm = TRUE)
  if (is.na(median_val)) {
    # median is NA only when the whole column is NA; fall back to 0
    median_val <- 0
    print(paste("Warning: Train median NA for", col))
  }
  train_medians[[col]] <- median_val # Store median for applying to test set
  if (any(is.na(train_data[[col]]))) {
    train_data[[col]][is.na(train_data[[col]])] <- median_val
  }
}
# Impute character/factor with mode (calculated from training data).
# NOTE(fix): store a mode for EVERY categorical predictor (not only those
# with NAs in the training split) so any test-set NA can be filled with a
# training-derived value instead of causing the row to be dropped later.
train_modes <- list()
for (col in factor_predictors_train) {
  mode_val <- calculate_mode(train_data[[col]], na.rm = TRUE)
  if (is.na(mode_val)) {
    # mode is NA only when the whole column is NA; use a sentinel level
    mode_val <- "Missing"
    print(paste("Warning: Train mode NA for", col))
  }
  train_modes[[col]] <- mode_val # Store mode for applying to test set
  if (any(is.na(train_data[[col]]))) {
    train_data[[col]][is.na(train_data[[col]])] <- mode_val
  }
}
# Remove rows with NA in target (TRAIN) — imputation deliberately never
# touches the target, so rows without a label carry no training signal.
rows_before_na_target_train <- nrow(train_data)
train_data <- train_data %>% filter(!is.na(.data[[target_variable_name]]))
if (nrow(train_data) < rows_before_na_target_train) print("Removed rows with NA target in TRAIN")
# --- Impute Test Data (using values from training data) ---
# Numeric columns: fill test-set NAs with the stored *training* medians so
# no information from the test split leaks into the imputation.
for (num_col in names(train_medians)) {
  if (!num_col %in% names(test_data)) next
  na_mask <- is.na(test_data[[num_col]])
  if (any(na_mask)) {
    test_data[[num_col]][na_mask] <- train_medians[[num_col]]
  }
}
# Categorical columns: fill test-set NAs with the stored *training* modes.
for (cat_col in names(train_modes)) {
  if (!cat_col %in% names(test_data)) next
  na_mask <- is.na(test_data[[cat_col]])
  if (any(na_mask)) {
    test_data[[cat_col]][na_mask] <- train_modes[[cat_col]]
  }
}
# Safety net: if any predictor NAs survived imputation (e.g. a column that
# had no stored training statistic), report them and drop the affected rows.
test_predictors <- test_data %>% select(-all_of(target_variable_name))
na_check_test <- colSums(is.na(test_predictors))
if (any(na_check_test > 0)) {
  print("Warning: NAs still present in test set predictors after imputation:")
  print(na_check_test[na_check_test > 0])
  # complete.cases is evaluated on predictors only; the target may still be NA
  # and is handled separately below.
  test_data <- test_data[complete.cases(test_predictors), ]
  print("Removed test rows with remaining NAs in predictors.")
}
# Remove rows with NA in target (TEST) — mirrors the TRAIN-side cleanup:
# unlabeled rows cannot be evaluated.
rows_before_na_target_test <- nrow(test_data)
test_data <- test_data %>% filter(!is.na(.data[[target_variable_name]]))
if (nrow(test_data) < rows_before_na_target_test) print("Removed rows with NA target in TEST")Ensure categorical predictors and the target variable are factors with consistent levels.
# Convert target variable to factor with desired levels and labels.
# factor() silently maps any value other than CANDIDATE / FALSE POSITIVE to
# NA, so the conversion is factored into a helper (removing the previous
# train/test duplication) and followed by an explicit NA check.
relabel_target <- function(x) {
  factor(x,
    levels = c("CANDIDATE", "FALSE POSITIVE"),
    labels = c("candidate", "false_positive")
  )
}
train_data[[target_variable_name]] <- relabel_target(train_data[[target_variable_name]])
test_data[[target_variable_name]] <- relabel_target(test_data[[target_variable_name]])
if (anyNA(train_data[[target_variable_name]]) || anyNA(test_data[[target_variable_name]])) {
  print("Warning: factor conversion introduced NA target values (unexpected disposition labels).")
}
print("Converted target variable to factor with levels:")## [1] "Converted target variable to factor with levels:"
## [1] "candidate" "false_positive"
# Define positive class based on the new factor levels.
# The second level ("false_positive") is treated as the event of interest
# throughout the evaluation below.
positive_class <- levels(train_data[[target_variable_name]])[2]
if (is.na(positive_class)) stop("Could not determine positive class level after factoring.")
print(paste("Positive class for evaluation:", positive_class))## [1] "Positive class for evaluation: false_positive"
Scale numeric predictors AFTER splitting and handling NAs/factors. Fit scaler only on training data.
# Identify the numeric columns that survived imputation / NA removal.
numeric_predictors_final <- names(train_data)[sapply(train_data, is.numeric)]
# Check for zero variance columns before scaling
# (a constant column has SD = 0, so centering/scaling would be undefined).
nzv <- nearZeroVar(train_data[, numeric_predictors_final], saveMetrics = TRUE)
cols_to_scale <- rownames(nzv)[!nzv$zeroVar]
if (length(cols_to_scale) < length(numeric_predictors_final)) {
print("Warning: Found zero-variance numeric columns, excluding from scaling:")
print(rownames(nzv)[nzv$zeroVar])
}
if (length(cols_to_scale) > 0) {
# Fit the center/scale transform on TRAIN only, then apply it to both
# splits — the test set must never influence the scaling parameters.
scaler <- preProcess(train_data[, cols_to_scale], method = c("center", "scale"))
train_data_scaled <- predict(scaler, train_data)
test_data_scaled <- predict(scaler, test_data)
print(paste("Scaled numeric predictors:", paste(cols_to_scale, collapse = ", ")))
} else {
print("No non-zero variance numeric predictors found to scale.")
train_data_scaled <- train_data
test_data_scaled <- test_data
}## [1] "Scaled numeric predictors: koi_period, koi_duration, koi_depth, koi_prad, koi_teq, koi_insol, koi_model_snr, koi_steff, koi_slogg, koi_srad, koi_smass, koi_impact, koi_ror, koi_srho, koi_sma, koi_incl, koi_dor, koi_ldm_coeff1, koi_ldm_coeff2, koi_smet"
Define the formula and fit the GLM.
# Model the target as a function of every remaining column ("~ .").
glm_original_formula <- as.formula(paste(target_variable_name, "~ ."))
print(paste("Using formula:", deparse(glm_original_formula)))## [1] "Using formula: koi_pdisposition ~ ."
# Logistic regression: binomial family with logit link. glm() models the
# probability of the SECOND factor level ("false_positive"), matching
# positive_class defined above.
glm_original_model <- glm(glm_original_formula,
data = train_data_scaled,
family = binomial(link = "logit")
)
summary(glm_original_model)##
## Call:
## glm(formula = glm_original_formula, family = binomial(link = "logit"),
## data = train_data_scaled)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.09986 0.09556 11.510 < 2e-16 ***
## koi_period 0.25772 0.21190 1.216 0.22390
## koi_duration 1.15093 0.09986 11.526 < 2e-16 ***
## koi_depth 2.19920 0.40141 5.479 4.28e-08 ***
## koi_prad 0.25948 0.38662 0.671 0.50212
## koi_teq 1.82665 0.11695 15.619 < 2e-16 ***
## koi_insol -0.44363 0.05698 -7.786 6.90e-15 ***
## koi_model_snr -0.04758 0.08904 -0.534 0.59313
## koi_steff -0.48874 0.15495 -3.154 0.00161 **
## koi_slogg 0.67690 0.08798 7.694 1.43e-14 ***
## koi_srad 0.01393 0.07724 0.180 0.85685
## koi_smass 0.44922 0.09916 4.530 5.89e-06 ***
## koi_impact 0.53968 0.10365 5.207 1.92e-07 ***
## koi_ror 2.55320 0.49385 5.170 2.34e-07 ***
## koi_srho 0.02435 0.03015 0.808 0.41923
## koi_sma -0.29034 0.24926 -1.165 0.24411
## koi_incl -0.41634 0.09885 -4.212 2.53e-05 ***
## koi_dor 0.58369 0.08070 7.233 4.74e-13 ***
## koi_ldm_coeff1 -1.05599 0.46039 -2.294 0.02181 *
## koi_ldm_coeff2 -0.98381 0.36463 -2.698 0.00697 **
## koi_smet -0.56487 0.05652 -9.995 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8870.4 on 6398 degrees of freedom
## Residual deviance: 5199.5 on 6378 degrees of freedom
## AIC: 5241.5
##
## Number of Fisher Scoring iterations: 9
Variables Associated with an INCREASED Likelihood of being a “FALSE POSITIVE”:
koi_duration (1.15093 ***): Longer transit
durations are associated with a higher chance of being a
“false_positive”.
This might seem counterintuitive, as real planets have specific
durations. However, some types of false positives (e.g., grazing
eclipsing binaries, certain stellar variability) might also produce
long-duration events that get flagged.
koi_depth (2.19920 ***): Deeper transits are
associated with a higher chance of being a “false_positive”.
This is also potentially counterintuitive, as deep transits are strong
signals. However, very deep events might be more likely to be stellar
eclipsing binaries (another star, not a planet) rather than a planet,
especially if koi_ror is also large.
koi_teq (1.82665 ***): Higher equilibrium
temperatures are associated with a higher chance of being a
“false_positive”.
This could mean that objects with extremely high calculated T_eq are
more likely to be misclassified astrophysical phenomena or certain types
of eclipsing binaries rather than stable planetary candidates.
koi_slogg (0.67690 ***): Higher stellar surface
gravity (log₁₀(cm s⁻²)) is associated with a higher chance of being a
“false_positive”.
Stars with very high surface gravity are typically compact (e.g., some
types of dwarfs or even white dwarfs if they were in the sample).
Transits around these might have different characteristics or be
mimicked by phenomena that are ultimately classified as false positives.
Alternatively, if true candidates are mostly around typical
main-sequence stars, deviations from that slogg might
increase FP likelihood.
koi_smass (0.44922 ***): Higher stellar mass is
associated with a higher chance of being a “false_positive”.
Similar to koi_slogg, perhaps transits around more massive
stars in the dataset are more prone to being astrophysical false positives
or are harder to distinguish.
koi_impact (0.53968 ***): A higher impact
parameter is associated with a higher chance of being a
“false_positive”.
This makes perfect sense: a higher impact parameter means that the
transit is more grazing. Grazing transits often have a V-shape (rather
than a U-shape) and are a common characteristic of eclipsing binaries or
other false positive scenarios.
koi_ror (2.55320 ***): A larger planet-to-star
radius ratio is associated with a higher chance of being a
“false_positive”.
This is a very strong indicator. While a larger planet is easier to
detect, a very large koi_ror often points towards an
eclipsing binary system (two stars orbiting each other) rather than a
planet transiting a star, as planets are generally much smaller than
their host stars. This is a classic false positive signature.
koi_dor (0.58369 ***): A larger planet-star
distance over star radius (distance at mid-transit normalized by stellar
radius) is associated with a higher chance of being a
“false_positive”.
koi_dor is related to the impact parameter and how far from
the star’s center the transit occurs, scaled by the star’s radius. A
larger value could indicate a more grazing transit geometry, consistent
with koi_impact also pointing towards
“false_positive”.
Variables Associated with a DECREASED Likelihood of being a “FALSE POSITIVE” (i.e., INCREASED Likelihood of being a “CANDIDATE”):
koi_insol (-0.44363 ***): Higher insolation flux
is associated with a lower chance of being a “false_positive” (i.e.,
more likely a “candidate”).
This suggests that objects receiving more stellar flux (often closer
planets) are more likely to be genuine candidates in the dataset once
other factors are taken into account.
koi_steff (-0.48874 **): Higher stellar
effective temperature is associated with a lower chance of being a
“false_positive” (i.e., more likely a “candidate”).
Transits around hotter stars might be cleaner or less prone to certain
types of astrophysical mimics that get flagged as false positives when
occurring around cooler stars (which can have more stellar
activity).
koi_incl (-0.41634 ***): Higher inclination is
associated with a lower chance of being a “false_positive” (i.e., more
likely a “candidate”).
This is physically sensible: an inclination closer to 90 degrees (edge-on) is required
for a transit. If “higher” koi_incl means closer to 90
degrees, then a more edge-on system is less likely to be a false
positive and more likely a true candidate. The model is correctly
identifying that good, central (or near-central) transits are more
likely to be candidates.
koi_ldm_coeff1 (-1.05599 *),
koi_ldm_coeff2 (-0.98381 **): Specific trends in these
limb darkening coefficients are associated with a lower chance of being
a “false_positive”.
This implies that transit light curve shapes that are well-fit with
these particular limb darkening characteristics are more typical of
genuine planetary candidates than false positives.
koi_smet (-0.56487 ***): Higher stellar
metallicity is associated with a lower chance of being a
“false_positive” (i.e., more likely a “candidate”).
This aligns very well with astrophysical understanding since stars with
higher metallicity are known to be more likely to host planets
(especially gas giants). So, the model finds that higher metallicity
makes an object more likely to be a genuine candidate.
Predict on the test set and evaluate.
# Score the held-out test set with the fitted GLM: predicted probability of
# the positive class ("false_positive").
original_probs <- predict(glm_original_model, newdata = test_data_scaled, type = "response")
# Clamp probabilities strictly inside (0, 1) so log-based metrics stay finite.
original_probs <- pmin(pmax(original_probs, 1e-15), 1 - 1e-15)
# Hard classification at the conventional 0.5 cutoff.
threshold <- 0.5
# Build the prediction factor with the same level ordering as the test target.
test_target_factor <- test_data_scaled[[target_variable_name]]
test_target_levels <- levels(test_target_factor)
negative_class <- test_target_levels[1]
predicted_labels <- ifelse(original_probs > threshold, positive_class, negative_class)
original_preds <- factor(predicted_labels, levels = test_target_levels)
# Confusion matrix with "false_positive" explicitly marked as positive.
cm <- confusionMatrix(original_preds,
  test_target_factor,
  positive = positive_class
)
cm## Confusion Matrix and Statistics
##
## Reference
## Prediction candidate false_positive
## candidate 708 198
## false_positive 98 595
##
## Accuracy : 0.8149
## 95% CI : (0.795, 0.8336)
## No Information Rate : 0.5041
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6294
##
## Mcnemar's Test P-Value : 8.702e-09
##
## Sensitivity : 0.7503
## Specificity : 0.8784
## Pos Pred Value : 0.8586
## Neg Pred Value : 0.7815
## Prevalence : 0.4959
## Detection Rate : 0.3721
## Detection Prevalence : 0.4334
## Balanced Accuracy : 0.8144
##
## 'Positive' Class : false_positive
##
Plot confusion matrix
# Render the confusion matrix as a tile heatmap:
# columns = actual class (Reference), rows = predicted class.
cm_data <- as.data.frame(cm$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
# Use a color scheme less prone to issues for colorblindness if possible
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
theme_minimal(base_size = 12) + # Set base font size
labs(
title = "Confusion Matrix Heatmap", # Corrected title
x = "Actual Class",
y = "Predicted Class"
) +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
axis.title = element_text(size = 14),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
)ROC curve
# ROC analysis on the test-set probabilities.
# NOTE(fix): pass direction = "<" explicitly. Without it, pROC auto-detects
# the comparison direction from the data, which can silently flip the curve
# (and report AUC > 0.5 even for a worse-than-chance classifier). levels is
# c(control, case) = c("candidate", "false_positive").
roc_original <- roc(
  response = test_target_factor,
  predictor = original_probs,
  levels = test_target_levels,
  direction = "<"
) # Specify levels and direction explicitly
print(paste("GLM Original Features - AUC:", round(auc(roc_original), 4)))## [1] "GLM Original Features - AUC: 0.8882"
Plot ROC curve
# Plot the ROC curve with the AUC value annotated on the figure.
plot(roc_original,
main = "ROC Curve - GLM Original Features",
col = "blue",
lwd = 2,
print.auc = TRUE,
print.auc.y = 0.75,
print.auc.x = 0.75
)Check for influential points using cook’s distance.
Find and remove all influential points with Cook’s distance > 0.5
# Cook's distance flags observations with outsized influence on the fitted
# coefficients; > 0.5 is a conservative cutoff (a common rule of thumb is 1).
cooks_d <- cooks.distance(glm_original_model)
outlier_indices <- which(cooks_d > 0.5)
print(paste("Number of influential points to remove:", length(outlier_indices)))## [1] "Number of influential points to remove: 2"
# Remove all influential points
# NOTE(review): positional indexing assumes the model frame has the same row
# order as train_data_scaled (true only if glm() dropped no rows) — confirm.
if (length(outlier_indices) > 0) {
train_data_scaled <- train_data_scaled[-outlier_indices, ]
print(paste("Removed", length(outlier_indices), "influential points"))
}## [1] "Removed 2 influential points"
Refit the GLM with the updated training data.
# Refit the logistic regression on the cleaned training data; formula and
# family are unchanged, only the rows differ.
glm_original_model <- glm(glm_original_formula,
data = train_data_scaled,
family = binomial(link = "logit")
)
summary(glm_original_model)##
## Call:
## glm(formula = glm_original_formula, family = binomial(link = "logit"),
## data = train_data_scaled)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.14156 0.08507 13.419 < 2e-16 ***
## koi_period 0.29864 0.21669 1.378 0.16814
## koi_duration 1.29810 0.10383 12.502 < 2e-16 ***
## koi_depth 0.77960 0.34790 2.241 0.02503 *
## koi_prad 0.01257 0.24641 0.051 0.95933
## koi_teq 1.99661 0.12031 16.595 < 2e-16 ***
## koi_insol -0.83265 0.09432 -8.828 < 2e-16 ***
## koi_model_snr -0.08312 0.08217 -1.012 0.31171
## koi_steff -0.51090 0.15898 -3.214 0.00131 **
## koi_slogg 0.75842 0.08950 8.474 < 2e-16 ***
## koi_srad 0.08568 0.07305 1.173 0.24087
## koi_smass 0.51646 0.10212 5.057 4.25e-07 ***
## koi_impact 0.40769 0.10601 3.846 0.00012 ***
## koi_ror 5.73441 0.50313 11.398 < 2e-16 ***
## koi_srho 0.02551 0.03030 0.842 0.39986
## koi_sma -0.40431 0.25554 -1.582 0.11361
## koi_incl -0.25444 0.10093 -2.521 0.01170 *
## koi_dor 0.65097 0.08326 7.819 5.35e-15 ***
## koi_ldm_coeff1 -1.01362 0.47104 -2.152 0.03141 *
## koi_ldm_coeff2 -0.95850 0.37285 -2.571 0.01015 *
## koi_smet -0.57885 0.05791 -9.996 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8867.7 on 6396 degrees of freedom
## Residual deviance: 5051.9 on 6376 degrees of freedom
## AIC: 5093.9
##
## Number of Fisher Scoring iterations: 8
Check outliers again.
Find and remove all influential points with Cook’s distance > 0.5
# Second Cook's distance pass: removing points changes the fit, which can
# expose new influential observations, so the check is repeated after refit.
cooks_d <- cooks.distance(glm_original_model)
outlier_indices <- which(cooks_d > 0.5)
print(paste("Number of influential points to remove:", length(outlier_indices)))## [1] "Number of influential points to remove: 1"
# Remove all influential points
if (length(outlier_indices) > 0) {
train_data_scaled <- train_data_scaled[-outlier_indices, ]
print(paste("Removed", length(outlier_indices), "influential points"))
}## [1] "Removed 1 influential points"
Refit the GLM with the updated training data.
# Refit once more after the second round of influential-point removal.
glm_original_model <- glm(glm_original_formula,
data = train_data_scaled,
family = binomial(link = "logit")
)
summary(glm_original_model)##
## Call:
## glm(formula = glm_original_formula, family = binomial(link = "logit"),
## data = train_data_scaled)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.13220 0.08512 13.301 < 2e-16 ***
## koi_period 0.27949 0.21694 1.288 0.19763
## koi_duration 1.30688 0.10405 12.561 < 2e-16 ***
## koi_depth 0.77501 0.34777 2.229 0.02585 *
## koi_prad 0.01662 0.26552 0.063 0.95008
## koi_teq 2.02544 0.12105 16.732 < 2e-16 ***
## koi_insol -1.15821 0.12615 -9.181 < 2e-16 ***
## koi_model_snr -0.08300 0.08220 -1.010 0.31262
## koi_steff -0.51294 0.15913 -3.223 0.00127 **
## koi_slogg 0.76720 0.08972 8.551 < 2e-16 ***
## koi_srad 0.09537 0.07306 1.305 0.19175
## koi_smass 0.51567 0.10259 5.026 5.00e-07 ***
## koi_impact 0.41457 0.10607 3.908 9.29e-05 ***
## koi_ror 5.74178 0.51005 11.257 < 2e-16 ***
## koi_srho 0.02530 0.03031 0.835 0.40399
## koi_sma -0.38305 0.25592 -1.497 0.13446
## koi_incl -0.24436 0.10075 -2.426 0.01529 *
## koi_dor 0.65519 0.08338 7.858 3.91e-15 ***
## koi_ldm_coeff1 -1.00526 0.47154 -2.132 0.03302 *
## koi_ldm_coeff2 -0.95198 0.37324 -2.551 0.01075 *
## koi_smet -0.58088 0.05801 -10.013 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8866.2 on 6395 degrees of freedom
## Residual deviance: 5045.5 on 6375 degrees of freedom
## AIC: 5087.5
##
## Number of Fisher Scoring iterations: 8
Check outliers again.
Now the model should be good.
Predict on the test set and evaluate.
# Re-evaluate the refitted GLM on the (unchanged) test split: probability of
# the positive class ("false_positive") for each test row.
original_probs <- predict(glm_original_model, newdata = test_data_scaled, type = "response")
# Clamp probabilities strictly inside (0, 1) so log-based metrics stay finite.
original_probs <- pmin(pmax(original_probs, 1e-15), 1 - 1e-15)
# Hard classification at the conventional 0.5 cutoff.
threshold <- 0.5
# Level ordering comes from the test target factor; positive_class was
# defined earlier from the train-side levels and matches by construction.
test_target_factor <- test_data_scaled[[target_variable_name]]
test_target_levels <- levels(test_target_factor)
negative_class <- test_target_levels[1] # Should be 'candidate'
predicted_labels <- ifelse(original_probs > threshold, positive_class, negative_class)
original_preds <- factor(predicted_labels, levels = test_target_levels)
# Confusion matrix with "false_positive" explicitly marked as positive.
cm <- confusionMatrix(original_preds,
  test_target_factor,
  positive = positive_class
)
cm## Confusion Matrix and Statistics
##
## Reference
## Prediction candidate false_positive
## candidate 708 186
## false_positive 98 607
##
## Accuracy : 0.8224
## 95% CI : (0.8028, 0.8408)
## No Information Rate : 0.5041
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6444
##
## Mcnemar's Test P-Value : 2.437e-07
##
## Sensitivity : 0.7654
## Specificity : 0.8784
## Pos Pred Value : 0.8610
## Neg Pred Value : 0.7919
## Prevalence : 0.4959
## Detection Rate : 0.3796
## Detection Prevalence : 0.4409
## Balanced Accuracy : 0.8219
##
## 'Positive' Class : false_positive
##
Plot confusion matrix
# Heatmap of the post-outlier-removal confusion matrix:
# columns = actual class (Reference), rows = predicted class.
cm_data <- as.data.frame(cm$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
# Use a color scheme less prone to issues for colorblindness if possible
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
theme_minimal(base_size = 12) + # Set base font size
labs(
title = "Confusion Matrix Heatmap", # Corrected title
x = "Actual Class",
y = "Predicted Class"
) +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
axis.title = element_text(size = 14),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
)ROC curve
# ROC analysis for the refitted model.
# NOTE(fix): pass direction = "<" explicitly so pROC does not auto-detect
# the comparison direction (auto-detection can silently invert the curve);
# levels is c(control, case) = c("candidate", "false_positive").
roc_original <- roc(
  response = test_target_factor,
  predictor = original_probs,
  levels = test_target_levels,
  direction = "<"
) # Specify levels and direction explicitly
print(paste("GLM Original Features - AUC:", round(auc(roc_original), 4)))## [1] "GLM Original Features - AUC: 0.8874"
Plot ROC curve
# Plot the ROC curve for the refitted model with the AUC annotated.
plot(roc_original,
main = "ROC Curve - GLM Original Features",
col = "blue",
lwd = 2,
print.auc = TRUE,
print.auc.y = 0.75,
print.auc.x = 0.75
)Calculates Variance Inflation Factors. Important when using original predictors. High VIF (> 5 or > 10) suggests multicollinearity might be inflating standard errors of coefficients, making them unstable/unreliable.
## [1] "VIF Values:"
## koi_period koi_duration koi_depth koi_prad koi_teq
## 45.365541 3.146117 2.405060 2.099924 5.875834
## koi_insol koi_model_snr koi_steff koi_slogg koi_srad
## 1.658242 1.735790 15.228120 4.938804 2.532314
## koi_smass koi_impact koi_ror koi_srho koi_sma
## 5.751579 2.101254 2.885427 1.112261 63.188275
## koi_incl koi_dor koi_ldm_coeff1 koi_ldm_coeff2 koi_smet
## 2.014036 4.972557 173.500591 113.851021 2.123956
# Check for high values: VIF above ~5-10 signals problematic collinearity.
# (vif_values is computed in an earlier chunk not shown here.)
max_vif <- max(vif_values, na.rm = TRUE)
print(paste("Max VIF:", round(max_vif, 2)))## [1] "Max VIF: 173.5"
From the VIF values, we notice the following: 1. Physical Relationships: koi_period and koi_sma are expected to be highly correlated due to Kepler’s Third Law. 2. Parameter Dependencies: Limb darkening coefficients (koi_ldm_coeff1, koi_ldm_coeff2) are often highly correlated with each other and can also be correlated with stellar temperature (koi_steff). Stellar parameters (koi_steff, koi_teq, koi_smass) are often correlated among themselves.
We proceed by removing highly correlated predictors from the model.
# Drop the collinear predictors flagged by the VIF analysis (koi_sma tracks
# koi_period via Kepler's third law; the limb-darkening coefficients track
# each other), then refit the model.
predictors_to_remove <- c("koi_sma", "koi_ldm_coeff2")
train_data_scaled <- train_data_scaled[, setdiff(names(train_data_scaled), predictors_to_remove)]
test_data_scaled <- test_data_scaled[, setdiff(names(test_data_scaled), predictors_to_remove)]
# Keep the predictor bookkeeping in sync with the data frames.
predictor_cols <- setdiff(predictor_cols, predictors_to_remove)
# The "~ ." formula automatically picks up the reduced column set on refit.
glm_original_model <- glm(glm_original_formula,
  data = train_data_scaled,
  family = binomial(link = "logit")
)
summary(glm_original_model)##
## Call:
## glm(formula = glm_original_formula, family = binomial(link = "logit"),
## data = train_data_scaled)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.12710 0.08484 13.286 < 2e-16 ***
## koi_period -0.02010 0.07338 -0.274 0.78415
## koi_duration 1.26967 0.10099 12.573 < 2e-16 ***
## koi_depth 0.78093 0.34724 2.249 0.02452 *
## koi_prad 0.01503 0.26629 0.056 0.95498
## koi_teq 2.12831 0.10022 21.237 < 2e-16 ***
## koi_insol -1.20622 0.12232 -9.861 < 2e-16 ***
## koi_model_snr -0.07166 0.08178 -0.876 0.38091
## koi_steff -0.17450 0.08311 -2.100 0.03576 *
## koi_slogg 0.79903 0.08851 9.028 < 2e-16 ***
## koi_srad 0.11529 0.07186 1.604 0.10861
## koi_smass 0.46510 0.09684 4.803 1.57e-06 ***
## koi_impact 0.41540 0.10568 3.931 8.47e-05 ***
## koi_ror 5.69350 0.50833 11.200 < 2e-16 ***
## koi_srho 0.02612 0.03047 0.857 0.39119
## koi_incl -0.24948 0.10070 -2.477 0.01323 *
## koi_dor 0.62047 0.07931 7.823 5.14e-15 ***
## koi_ldm_coeff1 0.18397 0.06020 3.056 0.00224 **
## koi_smet -0.66341 0.04795 -13.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8866.2 on 6395 degrees of freedom
## Residual deviance: 5054.1 on 6377 degrees of freedom
## AIC: 5092.1
##
## Number of Fisher Scoring iterations: 8
Overall Model Changes & Fit:
koi_ldm_coeff2 is no
longer present. This change in model structure is likely contributing to
the improved AIC.Coefficients Interpretation (Positive Class: “false_positive”):
Variables Associated with an INCREASED Likelihood of being a “FALSE POSITIVE”:
koi_duration (1.26967 ***): Longer transit durations are strongly associated with an increased likelihood of being a “false_positive”. Consistent with the previous model, slightly stronger effect.
koi_depth (0.78093 *): Deeper transits are associated with an increased likelihood of being a “false_positive”. Still positive, but the effect size and significance are notably reduced compared to the previous 2.19920 ***.
koi_teq (2.12831 ***): Higher equilibrium temperatures are very strongly associated with an increased likelihood of being a “false_positive”. Consistent, effect size increased from 1.82665 ***.
koi_slogg (0.79903 ***): Higher stellar surface gravity is very strongly associated with an increased likelihood of being a “false_positive”. Consistent, slightly stronger effect than 0.67690 ***.
koi_smass (0.46510 ***): Higher stellar mass is strongly associated with an increased likelihood of being a “false_positive”. Consistent, similar magnitude to 0.44922 ***.
koi_impact (0.41540 ***): A higher impact parameter (more grazing transit) is strongly associated with an increased likelihood of being a “false_positive”. Consistent, slightly smaller magnitude than 0.53968 ***.
koi_ror (5.69350 ***): A larger planet-to-star
radius ratio is very strongly associated with an increased likelihood of
being a “false_positive”. This effect has become much
stronger—previously 2.55320 ***—highlighting koi_ror as an
even more critical indicator of false positives.
koi_dor (0.62047 ***): Larger planet-star distance over star radius (at mid-transit) is very strongly associated with an increased likelihood of being a “false_positive”. Consistent, similar magnitude to 0.58369 ***.
koi_ldm_coeff1 (0.18397 **): This coefficient
has flipped sign and changed significance. Previously -1.05599 *, now
positive. Indicates that higher values of this coefficient are now
associated with false positives, possibly due to the removal of
koi_ldm_coeff2.
Variables Associated with a DECREASED Likelihood of being a “FALSE POSITIVE”:
koi_insol (-1.20622 ***): Higher insolation flux is very strongly associated with a decreased likelihood of being a “false_positive”. Consistent, and effect size more than doubled from -0.44363 ***.
koi_steff (-0.17450 *): Higher stellar effective temperature is associated with a decreased likelihood of being a “false_positive”. Consistent, but effect size and significance reduced from -0.48874 **.
koi_incl (-0.24948 *): Higher inclination (closer to 90°, more edge-on) is associated with a decreased likelihood of being a “false_positive”. Consistent, but effect size and significance reduced from -0.41634 ***.
koi_smet (-0.66341 ***): Higher stellar metallicity is very strongly associated with a decreased likelihood of being a “false_positive”. Consistent, slightly stronger effect than -0.56487 ***.
The logistic regression model was fitted using scaled original numeric features (excluding the binary flag variables) to predict the koi_pdisposition.
Model Fit & Significant Predictors: The overall model shows a highly significant improvement over the null model (intercept only), indicated by the large drop in deviance (Null: 8866.2, Residual: 5045.5). Several predictors showed a statistically significant relationship (p < 0.05) with the target variable in this model: koi_duration, koi_depth, koi_teq, koi_insol, koi_steff, koi_slogg, koi_smass, koi_impact, koi_ror, koi_incl, koi_dor, koi_ldm_coeff1, koi_ldm_coeff2, and koi_smet. Notable predictors that were not statistically significant in this specific model include koi_period, koi_prad, koi_model_snr, koi_srad, koi_srho, and koi_sma. This could be due to collinearity with other included variables.
Residual Analysis (Non-linearity Check): Tests for non-linearity (curvature) were performed on the model’s residuals. Significant evidence (p < 0.05) of non-linear patterns remaining in the residuals was found for the following predictors: koi_depth, koi_teq, koi_insol, koi_slogg, koi_srad, koi_smass, koi_impact, koi_sma, koi_incl, koi_dor, and koi_smet. koi_period was borderline significant. This indicates that the assumption of a purely linear relationship between these predictors and the log-odds of the outcome is likely violated.
Conclusion: The GLM identifies many significant predictors for exoplanet disposition. However, the residual analysis strongly suggests that the model’s linear structure is insufficient to capture the underlying relationships for numerous variables. This indicates that model performance could potentially be improved by incorporating non-linear effects or using models better suited to capturing such complexities.
Let’s start by fitting a GAM model in order to use smooth functions s() for predictors showing non-linearity in residual analysis.
# Predictors whose GLM residuals showed significant curvature get smooth
# s() terms in the GAM; all others remain linear.
nonlinear_predictors_gam <- c(
"koi_depth", "koi_teq", "koi_insol", "koi_slogg",
"koi_srad", "koi_smass", "koi_impact", "koi_sma",
"koi_incl", "koi_dor", "koi_smet", "koi_period"
)
# Keep only columns still present (koi_sma was removed during VIF pruning).
nonlinear_predictors_gam <- nonlinear_predictors_gam[nonlinear_predictors_gam %in% names(train_data_scaled)]
linear_predictors_gam <- setdiff(predictor_cols, nonlinear_predictors_gam)Prepare the formula for the GAM model.
# Assemble the GAM formula: s() wrappers around the non-linear predictors,
# bare names for the rest. Either group may be empty.
smooth_terms <- if (length(nonlinear_predictors_gam) > 0) {
  paste(sprintf("s(%s)", nonlinear_predictors_gam), collapse = " + ")
} else {
  NULL
}
linear_terms <- if (length(linear_predictors_gam) > 0) {
  paste(linear_predictors_gam, collapse = " + ")
} else {
  NULL
}
gam_formula_str <- paste(target_variable_name, "~", paste(c(smooth_terms, linear_terms), collapse = " + "))
# Defensive cleanup of stray "+" in case one of the groups was empty
gam_formula_str <- gsub("\\+ $", "", gsub("^\\+ ", "", gsub(" \\+ \\+ ", " + ", gam_formula_str)))
gam_formula <- as.formula(gam_formula_str)
print(paste("Using GAM formula:", gam_formula_str))## [1] "Using GAM formula: koi_pdisposition ~ s(koi_depth) + s(koi_teq) + s(koi_insol) + s(koi_slogg) + s(koi_srad) + s(koi_smass) + s(koi_impact) + s(koi_incl) + s(koi_dor) + s(koi_smet) + s(koi_period) + koi_duration + koi_prad + koi_model_snr + koi_steff + koi_ror + koi_srho + koi_ldm_coeff1"
Fit the GAM model.
# Fit a binomial GAM (mgcv): logit link models the log-odds of the positive
# class; REML is used for smoothness (penalty) selection.
gam_model <- gam(gam_formula,
data = train_data_scaled,
family = binomial(link = "logit"),
method = "REML"
)
summary(gam_model)##
## Family: binomial
## Link function: logit
##
## Formula:
## koi_pdisposition ~ s(koi_depth) + s(koi_teq) + s(koi_insol) +
## s(koi_slogg) + s(koi_srad) + s(koi_smass) + s(koi_impact) +
## s(koi_incl) + s(koi_dor) + s(koi_smet) + s(koi_period) +
## koi_duration + koi_prad + koi_model_snr + koi_steff + koi_ror +
## koi_srho + koi_ldm_coeff1
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.94561 0.11063 8.548 < 2e-16 ***
## koi_duration 1.73728 0.14344 12.112 < 2e-16 ***
## koi_prad -0.28360 0.29322 -0.967 0.33345
## koi_model_snr -0.13978 0.08423 -1.659 0.09702 .
## koi_steff 0.01356 0.17555 0.077 0.93843
## koi_ror 6.64085 0.79541 8.349 < 2e-16 ***
## koi_srho -0.00168 0.03094 -0.054 0.95669
## koi_ldm_coeff1 0.20498 0.07090 2.891 0.00384 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(koi_depth) 2.721 3.404 6.681 0.0983 .
## s(koi_teq) 5.173 6.144 309.050 <2e-16 ***
## s(koi_insol) 1.000 1.000 0.004 0.9485
## s(koi_slogg) 4.254 5.476 53.962 <2e-16 ***
## s(koi_srad) 6.971 7.765 3.574 0.8206
## s(koi_smass) 1.002 1.004 0.513 0.4749
## s(koi_impact) 1.817 2.077 14.228 0.0305 *
## s(koi_incl) 2.289 2.865 3.743 0.3121
## s(koi_dor) 4.856 6.004 96.825 <2e-16 ***
## s(koi_smet) 5.843 6.826 97.286 <2e-16 ***
## s(koi_period) 2.696 3.438 4.490 0.2376
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.544 Deviance explained = 46.3%
## -REML = 2458.4 Scale est. = 1 n = 6396
Plot of the splines effect:
# Plot each of the 11 fitted smooths, six panels per page, with a dashed
# zero line for reference (a smooth whose band covers zero has little effect).
par(mfrow = c(2, 3))
for (i in 1:11) {
plot(gam_model, select = i, shade = TRUE, shade.col = "lightblue")
abline(h = 0, lty = "dashed")
}As we can see from the plots, not all the splines are significant.
For example koi_period, koi_insol,
koi_smass, koi_incl,
koi_depth and koi_impact are not significant.
We will try to remove them from the model.
gam_model <- update(gam_model, . ~ . - s(koi_period) - s(koi_insol) - s(koi_smass) - s(koi_period) - s(koi_incl) - s(koi_depth) - s(koi_impact))
summary(gam_model)##
## Family: binomial
## Link function: logit
##
## Formula:
## koi_pdisposition ~ s(koi_teq) + s(koi_slogg) + s(koi_srad) +
## s(koi_dor) + s(koi_smet) + koi_duration + koi_prad + koi_model_snr +
## koi_steff + koi_ror + koi_srho + koi_ldm_coeff1
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.00430 0.06785 14.801 < 2e-16 ***
## koi_duration 1.41230 0.09649 14.637 < 2e-16 ***
## koi_prad -0.19688 0.26623 -0.739 0.45961
## koi_model_snr -0.04401 0.07035 -0.626 0.53163
## koi_steff 0.23853 0.10635 2.243 0.02491 *
## koi_ror 7.67484 0.42627 18.004 < 2e-16 ***
## koi_srho 0.02748 0.02981 0.922 0.35658
## koi_ldm_coeff1 0.19630 0.07053 2.783 0.00538 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(koi_teq) 5.699 6.838 297.439 <2e-16 ***
## s(koi_slogg) 4.830 5.947 80.271 <2e-16 ***
## s(koi_srad) 6.249 7.163 6.793 0.49
## s(koi_dor) 8.097 8.784 209.363 <2e-16 ***
## s(koi_smet) 5.997 6.962 148.411 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.539 Deviance explained = 45.7%
## -REML = 2483.8 Scale est. = 1 n = 6396
Parametric Coefficients (Linear Effects)
These coefficients are interpreted like those in a standard GLM: they
represent the change in the log-odds of being a
"false_positive" for a one-unit increase in the (scaled)
predictor, holding other variables constant.
(Intercept) 1.00430 (***): The baseline log-odds
of a KOI being a "false_positive" is 1.00430 when all
linear predictors are zero and the smooth functions are at their average
effect. This is highly significant.
koi_duration 1.41230 (***): Longer transit
durations significantly increase the log-odds of being a
"false_positive". (Odds Ratio ≈ e^1.41230 ≈ 4.10). This
effect is consistent with and slightly stronger than in the previous
GLM.
koi_prad -0.19688 (p=0.45961): Planetary radius
is not a significant linear predictor of "false_positive"
status in this model.
koi_model_snr -0.04401 (p=0.53163): The transit signal-to-noise ratio is not a significant linear predictor.
koi_steff 0.23853 (*): Higher stellar effective
temperature (koi_steff) significantly increases the log-odds of being a
"false_positive". (Odds Ratio ≈ e^0.23853 ≈ 1.27).
Important Change: This is a flip in direction
compared to the previous GLM where higher koi_steff
decreased the odds of being a false positive. Now, in the context of
this GAM (with other variables modeled non-linearly), higher stellar
temperatures are associated with a higher likelihood of being a false
positive. This suggests that the relationship is complex and was being
influenced by the enforced linearity of other terms in the GLM.
koi_ror 7.67484 (***): The planet-to-star radius
ratio (koi_ror) has a very large and highly significant
positive effect on the log-odds of being a
"false_positive". (Odds Ratio ≈ e^7.67484 ≈ 2153). This
remains an extremely strong indicator, likely identifying eclipsing
binaries. The effect size is even larger than in the GLM.
koi_srho 0.02748 (p=0.35658): Fitted stellar density is not a significant linear predictor.
koi_ldm_coeff1 0.19630 (**): The first limb
darkening coefficient has a significant positive linear effect on the
log-odds of being a "false_positive". (Odds Ratio ≈
e^0.19630 ≈ 1.22). This is consistent with the GLM.
Approximate Significance of Smooth Terms (Non-Linear Effects)
For these variables, the relationship with the log-odds of being a
"false_positive" is not assumed to be linear. The
edf (estimated degrees of freedom) indicates the complexity
of the curve; an edf close to 1 suggests a near-linear
relationship, while higher values indicate more “wiggles” or
non-linearity.
koi_teq) has a highly significant and non-linear
relationship with the likelihood of being a
"false_positive". The edf of ~5.7 suggests a
fairly complex curve. The linear GLM showed a simple positive
association; this GAM indicates the true relationship is more nuanced
across the range of koi_teq.koi_slogg) also has a highly significant and
non-linear effect. The edf of ~4.8 indicates a complex
relationship, not just a simple linear trend as was suggested by the
GLM.koi_srad) is not significant. Even when
allowing for a non-linear relationship, koi_srad does not
significantly help predict "false_positive" status in this
model.koi_dor) has a highly
significant and very complex non-linear relationship (high
edf of ~8.1). This indicates that specific ranges or
patterns in koi_dor are associated with
"false_positive" likelihood in a way that a straight line
cannot capture.koi_smet) has a highly significant and non-linear effect.
The edf of ~6 suggests a complex curve. The GLM showed a
linear negative effect (higher metallicity → less likely FP). The GAM
suggests this relationship may not be consistently linear across all
metallicity values; for instance, the benefit might plateau or change at
very low/high metallicities.Summary and Key Insights from the GAM
koi_teq, koi_slogg, koi_dor,
koi_smet) have significant non-linear relationships with
the probability of a KOI being a "false_positive". This
means their influence is more complex than a simple positive or negative
trend. Visualizing these smooth terms is crucial.koi_ror) remains an overwhelmingly strong
linear predictor, with a massive coefficient, strongly indicating that
large koi_ror values are characteristic of
"false_positives" (likely eclipsing binaries).koi_steff (stellar
effective temperature) now increases the likelihood of a
"false_positive". This highlights how accounting for
non-linearities in other variables can change the apparent effect of
those kept linear. It suggests the relationship between
koi_steff and disposition is context-dependent.koi_duration and koi_ldm_coeff1 continue to
show a positive linear association with "false_positive"
likelihood.koi_prad,
koi_model_snr, koi_srho (linear terms), and
koi_srad (even as a smooth term) do not significantly
contribute to the model.After refitting, most of the retained splines are highly significant, though s(koi_srad) remains non-significant (p = 0.49). Plot of the splines effect:
# Plot every smooth remaining in the reduced GAM (2x3 layout), with a dashed
# zero reference line.
par(mfrow = c(2, 3))
# FIX: the reduced model retains 5 smooth terms (see its summary), so iterate
# over the model's actual smooths instead of the hard-coded 1:6, which asks
# plot.gam for a sixth smooth that does not exist.
for (i in seq_along(gam_model$smooth)) {
plot(gam_model, select = i, shade = TRUE, shade.col = "lightblue")
abline(h = 0, lty = "dashed")
}Predict on the test set and evaluate.
# Test-set probabilities from the GAM; clip away from exactly 0/1 so that
# downstream log/ROC computations stay numerically stable.
gam_probs <- predict(gam_model, newdata = test_data_scaled, type = "response")
gam_probs <- pmax(pmin(gam_probs, 1 - 1e-15), 1e-15)
gam_preds <- factor(ifelse(gam_probs > 0.5, positive_class, levels(test_target_factor)[1]), levels = levels(test_target_factor))Confusion matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction candidate false_positive
## candidate 695 165
## false_positive 111 628
##
## Accuracy : 0.8274
## 95% CI : (0.808, 0.8456)
## No Information Rate : 0.5041
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6546
##
## Mcnemar's Test P-Value : 0.001422
##
## Sensitivity : 0.7919
## Specificity : 0.8623
## Pos Pred Value : 0.8498
## Neg Pred Value : 0.8081
## Prevalence : 0.4959
## Detection Rate : 0.3927
## Detection Prevalence : 0.4622
## Balanced Accuracy : 0.8271
##
## 'Positive' Class : false_positive
##
Plot confusion matrix
# Visualize the GAM confusion matrix counts as a ggplot2 tile heatmap
cm_data <- as.data.frame(cm_gam$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
# Use a color scheme less prone to issues for colorblindness if possible
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
theme_minimal(base_size = 12) + # Set base font size
labs(
title = "Confusion Matrix Heatmap", # Corrected title
x = "Actual Class",
y = "Predicted Class"
) +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
axis.title = element_text(size = 14),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
)ROC curve
roc_gam <- roc(response = test_target_factor, predictor = gam_probs, levels = levels(test_target_factor))
print(paste("GAM - AUC:", round(auc(roc_gam), 4)))## [1] "GAM - AUC: 0.8937"
Plot ROC curve
# Plot the GAM ROC curve with the AUC annotated on the figure
plot(roc_gam,
main = "ROC Curve - GAM",
col = "blue",
lwd = 2,
print.auc = TRUE,
print.auc.y = 0.75,
print.auc.x = 0.75
)Let’s try adding interaction terms to the original GLM model.
# Variables chosen for pairwise interaction terms (informed by the GLM/GAM
# results above)
interaction_subset <- c("koi_teq", "koi_slogg", "koi_ror", "koi_smet", "koi_impact")
# Ensure these exist in the data
interaction_subset <- interaction_subset[interaction_subset %in% names(train_data_scaled)]
# Get all other predictors to include as main effects
all_predictors <- setdiff(names(train_data_scaled), target_variable_name)
other_predictors <- setdiff(all_predictors, interaction_subset)Prepare the formula for the GLM model with interaction terms.
# Build the interaction formula: "(a + b + ...)^2" expands to all main
# effects plus every pairwise interaction among the chosen subset; the
# remaining predictors enter as plain main effects.
interaction_block <- sprintf("(%s)^2", paste(interaction_subset, collapse = " + "))
main_effects <- paste(other_predictors, collapse = " + ")
interaction_formula_str <- paste(target_variable_name, "~", interaction_block, "+", main_effects)
interaction_formula <- as.formula(interaction_formula_str)
print(paste("Using interaction formula:", interaction_formula_str))## [1] "Using interaction formula: koi_pdisposition ~ (koi_teq + koi_slogg + koi_ror + koi_smet + koi_impact)^2 + koi_period + koi_duration + koi_depth + koi_prad + koi_insol + koi_model_snr + koi_steff + koi_srad + koi_smass + koi_srho + koi_incl + koi_dor + koi_ldm_coeff1"
Fit the GLM model.
# Logistic regression with pairwise interaction terms.
# NOTE(review): the fitted summary shows astronomically large coefficients
# and 25 Fisher scoring iterations -- classic symptoms of (quasi-)complete
# separation; consider penalized likelihood (e.g. glmnet or brglm2) before
# interpreting these coefficients.
glm_interaction_model <- glm(interaction_formula,
data = train_data_scaled,
family = binomial(link = "logit")
)
summary(glm_interaction_model)##
## Call:
## glm(formula = interaction_formula, family = binomial(link = "logit"),
## data = train_data_scaled)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.837e+14 9.182e+05 962444900 <2e-16 ***
## koi_teq 1.056e+15 1.991e+06 530666658 <2e-16 ***
## koi_slogg 4.326e+14 1.843e+06 234735460 <2e-16 ***
## koi_ror 1.465e+15 4.019e+06 364412212 <2e-16 ***
## koi_smet -3.529e+14 1.036e+06 -340529745 <2e-16 ***
## koi_impact 9.351e+13 2.577e+06 36282262 <2e-16 ***
## koi_period 2.376e+14 1.521e+06 156148218 <2e-16 ***
## koi_duration 4.758e+14 1.162e+06 409363536 <2e-16 ***
## koi_depth 1.864e+14 1.361e+06 137001244 <2e-16 ***
## koi_prad 2.407e+14 2.072e+06 116192510 <2e-16 ***
## koi_insol 3.147e+13 2.523e+06 12472902 <2e-16 ***
## koi_model_snr -4.602e+13 1.076e+06 -42785900 <2e-16 ***
## koi_steff -1.351e+14 1.779e+06 -75962347 <2e-16 ***
## koi_srad 1.993e+14 1.613e+06 123600517 <2e-16 ***
## koi_smass 2.297e+14 1.987e+06 115628797 <2e-16 ***
## koi_srho 3.575e+13 9.337e+05 38291236 <2e-16 ***
## koi_incl 4.230e+13 1.600e+06 26439186 <2e-16 ***
## koi_dor 1.143e+14 1.539e+06 74218505 <2e-16 ***
## koi_ldm_coeff1 1.398e+14 1.364e+06 102506756 <2e-16 ***
## koi_teq:koi_slogg 1.687e+14 5.822e+05 289846990 <2e-16 ***
## koi_teq:koi_ror -8.763e+14 2.987e+06 -293351502 <2e-16 ***
## koi_teq:koi_smet 2.522e+13 9.968e+05 25295452 <2e-16 ***
## koi_teq:koi_impact -3.046e+14 2.613e+06 -116592077 <2e-16 ***
## koi_slogg:koi_ror -3.951e+14 3.320e+06 -119029491 <2e-16 ***
## koi_slogg:koi_smet 1.152e+14 9.377e+05 122906001 <2e-16 ***
## koi_slogg:koi_impact -4.026e+13 2.231e+06 -18047890 <2e-16 ***
## koi_ror:koi_smet -4.314e+13 1.748e+06 -24684677 <2e-16 ***
## koi_ror:koi_impact -7.049e+13 1.159e+05 -608474447 <2e-16 ***
## koi_smet:koi_impact -1.282e+14 1.778e+06 -72140653 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8866.2 on 6395 degrees of freedom
## Residual deviance: 106184.6 on 6367 degrees of freedom
## AIC: 106243
##
## Number of Fisher Scoring iterations: 25
From the summary, we notice that this specific model output is not interpretable due to severe numerical problems, likely stemming from (quasi-)complete separation caused by the complexity of the interaction terms.
Predict on the test set and evaluate.
# Test-set probabilities from the interaction GLM; clip away from 0/1 for
# numerical stability in downstream metrics.
interaction_probs <- predict(glm_interaction_model, newdata = test_data_scaled, type = "response")
interaction_probs <- pmax(pmin(interaction_probs, 1 - 1e-15), 1e-15)
interaction_preds <- factor(ifelse(interaction_probs > 0.5, positive_class, levels(test_target_factor)[1]), levels = levels(test_target_factor))Confusion matrix
cm_interaction <- confusionMatrix(interaction_preds, test_target_factor, positive = positive_class)
cm_interaction## Confusion Matrix and Statistics
##
## Reference
## Prediction candidate false_positive
## candidate 486 80
## false_positive 320 713
##
## Accuracy : 0.7498
## 95% CI : (0.7279, 0.7709)
## No Information Rate : 0.5041
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5009
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8991
## Specificity : 0.6030
## Pos Pred Value : 0.6902
## Neg Pred Value : 0.8587
## Prevalence : 0.4959
## Detection Rate : 0.4459
## Detection Prevalence : 0.6460
## Balanced Accuracy : 0.7510
##
## 'Positive' Class : false_positive
##
Plot confusion matrix
# Visualize the interaction-GLM confusion matrix counts as a tile heatmap
cm_data <- as.data.frame(cm_interaction$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
# Use a color scheme less prone to issues for colorblindness if possible
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
theme_minimal(base_size = 12) + # Set base font size
labs(
title = "Confusion Matrix Heatmap", # Corrected title
x = "Actual Class",
y = "Predicted Class"
) +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
axis.title = element_text(size = 14),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
)ROC curve
roc_glm_interaction <- roc(response = test_target_factor, predictor = interaction_probs, levels = levels(test_target_factor))
print(paste("GLM with interaction - AUC:", round(auc(roc_glm_interaction), 4)))## [1] "GLM with interaction - AUC: 0.751"
Plot ROC curve
# Plot the interaction-GLM ROC curve with the AUC annotated on the figure
plot(roc_glm_interaction,
main = "ROC Curve - GLM with interaction",
col = "blue",
lwd = 2,
print.auc = TRUE,
print.auc.y = 0.75,
print.auc.x = 0.75
)Prepare dataset and formula
# Build the glmnet design matrix: all predictors, intercept column dropped
# (glmnet fits its own intercept).
# FIX: as.formula(paste("~ .")) wrapped a single literal string in paste()
# needlessly; pass the string directly.
train_formula_lasso <- as.formula("~ .") # Include all predictors
x_train <- model.matrix(train_formula_lasso, data = train_data_scaled[, all_predictors])[, -1] # Predictor matrix, remove intercept
y_train <- train_data_scaled[[target_variable_name]] # Target factorValue of lambda selected via cross validation:
## [1] 0.01330885
Selected model in lasso path plot:
the Lasso estimated coefficients corresponding to the best lambda are:
## (Intercept) koi_period koi_duration koi_depth koi_teq
## 0.3658801 0.1994419 0.3712789 0.8213715 0.9106747
## koi_model_snr koi_impact koi_ror koi_incl koi_dor
## 0.1879024 0.3371364 0.9173861 -0.6193954 0.1320083
## koi_smet
## -0.4631438
The LASSO (Least Absolute Shrinkage and Selection Operator) model
performs feature selection by shrinking some coefficients to zero. The
non-zero coefficients below are those selected by LASSO as important
predictors, using the lambda.min value from
cross-validation.
Variables Associated with an INCREASED Likelihood of being a “FALSE POSITIVE” (Positive Coefficients):
(Intercept) 0.3658801: This is the
baseline log-odds of a KOI being a “FALSE POSITIVE” when all other
predictor variables in the model are zero.koi_ror (0.9173861): A larger
planet-to-star radius ratio. This is a very strong indicator. A very
large koi_ror often points towards an eclipsing binary
system rather than a planet.koi_teq (0.9106747): Higher
equilibrium temperature. Objects with extremely high calculated T_eq
might be misclassified astrophysical phenomena.koi_depth (0.8213715): Deeper
transits. Very deep events might be more likely to be stellar eclipsing
binaries.koi_duration (0.3712789): Longer
transit durations. Some types of false positives (e.g., grazing
eclipsing binaries) can produce long-duration events.koi_impact (0.3371364): Higher impact
parameter. This means the transit is more grazing, which is common in
false positive scenarios.koi_period (0.1994419): Longer orbital
periods. This suggests a slight increase in the likelihood of being a
false positive.koi_model_snr (0.1879024): Higher
model signal-to-noise ratio. While counterintuitive, extremely high SNR
events might sometimes be non-planetary or indicative of issues that
lead to a false positive classification.koi_dor (0.1320083): Larger
planet-star distance over star radius. This can indicate a more grazing
transit geometry.Variables Associated with a DECREASED Likelihood of being a “FALSE POSITIVE” (i.e., INCREASED Likelihood of being a “CANDIDATE”) (Negative Coefficients):
koi_incl (-0.6193954): Higher orbital
inclination. An inclination closer to 90 degrees (edge-on transit) is
required for a true planetary transit, making it less likely to be a
false positive.koi_smet (-0.4631438): Higher stellar
metallicity. Stars with higher metallicity are known to be more likely
to host planets, making an object more likely to be a genuine
candidate.best_lambda <- cv_lasso_fit$lambda.min
# Test design matrix built with the same formula used for training
x_test <- model.matrix(train_formula_lasso, data = test_data_scaled[, all_predictors])[, -1]
# Evaluate using the best lambda found by CV
lasso_probs <- predict(cv_lasso_fit, newx = x_test, s = best_lambda, type = "response")
lasso_probs <- as.vector(pmax(pmin(lasso_probs, 1 - 1e-15), 1e-15)) # Ensure stability
lasso_preds <- factor(ifelse(lasso_probs > 0.5, positive_class, levels(test_target_factor)[1]), levels = levels(test_target_factor))Confusion matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction candidate false_positive
## candidate 722 222
## false_positive 84 571
##
## Accuracy : 0.8086
## 95% CI : (0.7885, 0.8276)
## No Information Rate : 0.5041
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6167
##
## Mcnemar's Test P-Value : 4.811e-15
##
## Sensitivity : 0.7201
## Specificity : 0.8958
## Pos Pred Value : 0.8718
## Neg Pred Value : 0.7648
## Prevalence : 0.4959
## Detection Rate : 0.3571
## Detection Prevalence : 0.4096
## Balanced Accuracy : 0.8079
##
## 'Positive' Class : false_positive
##
Plot confusion matrix
# Visualize the Lasso confusion matrix counts as a tile heatmap
cm_data <- as.data.frame(lasso_cm$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
# Use a color scheme less prone to issues for colorblindness if possible
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
theme_minimal(base_size = 12) + # Set base font size
labs(
title = "Confusion Matrix Heatmap", # Corrected title
x = "Actual Class",
y = "Predicted Class"
) +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
axis.title = element_text(size = 14),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
)ROC curve
roc_lasso <- roc(response = test_target_factor, predictor = lasso_probs, levels = levels(test_target_factor))
print(paste("Lasso - AUC:", round(auc(roc_lasso), 4)))## [1] "Lasso - AUC: 0.8816"
Plot ROC curve
# Plot the Lasso ROC curve with the AUC annotated on the figure
plot(roc_lasso,
main = "ROC Curve - Lasso",
col = "blue",
lwd = 2,
print.auc = TRUE,
print.auc.y = 0.75,
print.auc.x = 0.75
)Value of lambda selected via cross validation:
## [1] 0.01800751
Selected model in ridge path plot:
the Ridge estimated coefficients corresponding to the best lambda are:
## (Intercept) koi_period koi_duration koi_depth koi_prad
## 0.39884348 0.19678576 0.46405878 0.67426574 0.44766663
## koi_teq koi_insol koi_model_snr koi_steff koi_slogg
## 1.01258905 -0.35870599 0.35635727 0.02635554 0.22080297
## koi_srad koi_smass koi_impact koi_ror koi_srho
## -0.04054393 0.15144990 0.46024229 0.65050228 0.03576134
## koi_incl koi_dor koi_ldm_coeff1 koi_smet
## -0.59836143 0.20858063 0.07115970 -0.52529285
Ridge regression is another regularization technique that helps to prevent overfitting by shrinking the coefficient estimates towards zero. Unlike LASSO, Ridge typically does not shrink coefficients to be exactly zero, so it tends to keep all predictors in the model. The interpretation of the coefficients is similar to GLM and LASSO.
Variables Associated with an INCREASED Likelihood of being a “FALSE POSITIVE” (Positive Coefficients from the Ridge model):
(Intercept) 0.39884348: The baseline
log-odds of being a “FALSE POSITIVE” when other predictors are
zero.koi_teq (1.01258905): Higher
equilibrium temperature. This is the strongest positive predictor,
suggesting objects with very high T_eq are much more likely to be false
positives.koi_depth (0.67426574): Deeper
transits. A strong indicator for false positives.koi_ror (0.65050228): Larger
planet-to-star radius ratio. Also a strong indicator, often pointing to
eclipsing binaries.koi_duration (0.46405878): Longer
transit durations.koi_impact (0.46024229): Higher impact
parameter (more grazing transits).koi_prad (0.44766663): Larger
planetary radius. While larger planets are easier to detect, very large
apparent radii can be indicative of non-planetary objects.koi_model_snr (0.35635727): Higher
model signal-to-noise ratio.koi_slogg (0.22080297): Higher stellar
surface gravity.koi_dor (0.20858063): Larger
planet-star distance over star radius.koi_period (0.19678576): Longer
orbital periods.koi_smass (0.15144990): Higher stellar
mass.koi_ldm_coeff1 (0.07115970): The first
limb darkening coefficient. A small positive impact.koi_srho (0.03576134): Higher stellar
density. A very small positive impact.koi_steff (0.02635554): Higher stellar
effective temperature. A very small positive impact, which is
interesting as in other models it sometimes points to “CANDIDATE”. This
might be due to interactions or the nature of Ridge regularization.Variables Associated with a DECREASED Likelihood of being a “FALSE POSITIVE” (i.e., INCREASED Likelihood of being a “CANDIDATE”) (Negative Coefficients from the Ridge model):
koi_incl (-0.59836143): Higher orbital
inclination. This is the strongest negative predictor, meaning more
edge-on transits strongly suggest a “CANDIDATE”.koi_smet (-0.52529285): Higher stellar
metallicity. Metal-rich stars are more likely to host planets.koi_insol (-0.35870599): Higher
insolation flux. Objects receiving more stellar flux are more likely to
be candidates.koi_srad (-0.04054393): Larger stellar
radius. A very small negative impact.best_lambda_ridge <- cv_ridge_fit$lambda.min
# Ridge test-set probabilities at the CV-selected lambda; reuse x_test from
# the Lasso section and clip probabilities away from 0/1 for stability
ridge_probs <- predict(cv_ridge_fit, newx = x_test, s = best_lambda_ridge, type = "response")
ridge_probs <- as.vector(pmax(pmin(ridge_probs, 1 - 1e-15), 1e-15)) # Ensure stability
ridge_preds <- factor(ifelse(ridge_probs > 0.5, positive_class, levels(test_target_factor)[1]), levels = levels(test_target_factor))Confusion matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction candidate false_positive
## candidate 717 214
## false_positive 89 579
##
## Accuracy : 0.8105
## 95% CI : (0.7904, 0.8294)
## No Information Rate : 0.5041
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6205
##
## Mcnemar's Test P-Value : 1.051e-12
##
## Sensitivity : 0.7301
## Specificity : 0.8896
## Pos Pred Value : 0.8668
## Neg Pred Value : 0.7701
## Prevalence : 0.4959
## Detection Rate : 0.3621
## Detection Prevalence : 0.4178
## Balanced Accuracy : 0.8099
##
## 'Positive' Class : false_positive
##
Plot confusion matrix
cm_data <- as.data.frame(lasso_cm$table)
# Heatmap of the Ridge model's confusion matrix counts.
# NOTE(review): cm_data for this section should be built from ridge_cm$table
# (see the assignment above) -- verify it is not left pointing at lasso_cm.
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
# Use a color scheme less prone to issues for colorblindness if possible
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
theme_minimal(base_size = 12) + # Set base font size
labs(
title = "Confusion Matrix Heatmap", # Corrected title
x = "Actual Class",
y = "Predicted Class"
) +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
axis.title = element_text(size = 14),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
)ROC Curve
roc_ridge <- roc(response = test_target_factor, predictor = ridge_probs, levels = levels(test_target_factor))
print(paste("Ridge Regression - AUC:", round(auc(roc_ridge), 4)))## [1] "Ridge Regression - AUC: 0.8828"
Plot ROC curve
# Plot the Ridge ROC curve with the AUC annotated on the figure.
# FIX: the title said "ROC Curve - Lasso" (copy-paste error) although the
# curve plotted is roc_ridge.
plot(roc_ridge,
main = "ROC Curve - Ridge",
col = "blue",
lwd = 2,
print.auc = TRUE,
print.auc.y = 0.75,
print.auc.x = 0.75
)# Create data frame with model performance metrics
# --- Model comparison -------------------------------------------------------
# Assemble the test-set performance (Accuracy, Sensitivity, Specificity, AUC)
# of all five fitted models into a single table for side-by-side comparison.
model_cms <- list(cm, cm_gam, cm_interaction, lasso_cm, ridge_cm)
model_rocs <- list(roc_original, roc_gam, roc_glm_interaction, roc_lasso, roc_ridge)

# Pull one named statistic out of each caret confusionMatrix object
cm_stat <- function(slot, stat) {
  vapply(model_cms, function(m) unname(m[[slot]][stat]), numeric(1))
}

models_performance <- data.frame(
  Model = c("GLM", "GAM", "GLM with Interactions", "Lasso", "Ridge"),
  Accuracy = cm_stat("overall", "Accuracy"),
  Sensitivity = cm_stat("byClass", "Sensitivity"),
  Specificity = cm_stat("byClass", "Specificity"),
  AUC = vapply(model_rocs, function(r) as.numeric(auc(r)), numeric(1))
)

# Round numeric columns to 4 decimal places
models_performance[, 2:5] <- round(models_performance[, 2:5], 4)

# save to Rda
save(models_performance, file = "data/Rdas/models_performance.Rda")

# Display the results
models_performance